# --- environment setup -----------------------------------------------------
import numpy as np
import pandas as pd
import copy
import geopandas as gpd
import plotly.graph_objs as go
import plotly as py
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt
from scipy.stats import probplot
from scipy.stats import ttest_1samp
import warnings
warnings.filterwarnings('ignore')  # silence library deprecation chatter in the notebook output
init_notebook_mode()  # enable offline plotly rendering inside the notebook
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,5)  # default size for all matplotlib figures
Names of U.S. jurisdictions (states) and their abbreviations are organized into a DataFrame. The abbreviations will be used as geographical labels for visualizing the data in the form of a map.
# State names and two-letter abbreviations; the abbreviations are the
# location codes for the plotly "USA-states" choropleths drawn below.
states = pd.read_excel("./messy_data/states.xlsx")
states.head()
Delete District of Columbia and add the abbreviation FED for Federal, because DC falls under the Federal jurisdiction.
# Drop the "DC" row (DC is folded into the Federal jurisdiction) and append a
# "Federal" row with the custom abbreviation "FED".
states.drop(states.index[states.Abbreviation == "DC"], inplace=True)
states.columns = ["Jurisdiction", "Jurisdiction Abbreviation"]
# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the
# supported, backward-compatible way to add a row.
states = pd.concat(
    [states, pd.DataFrame([{"Jurisdiction": "Federal", "Jurisdiction Abbreviation": "FED"}])],
    ignore_index=True,
)
states.tail()
The U.S. Census Bureau divides the country into 4 regions -- Northeast, Midwest, South and West. Each region is further divided into several divisions.
# Census Bureau mapping of each state to its region and division.
regions = pd.read_excel("./messy_data/state_region.xlsx")
regions.head()
Drop District of Columbia.
# DC is handled as part of the Federal jurisdiction throughout this analysis.
regions.drop(regions.index[regions.State == "District of Columbia"], inplace=True)
# State boundary polygons for the geopandas maps.
usMap = gpd.read_file("./us_states_map.json")
usMap.head()
Drop District of Columbia and Puerto Rico.
# Keep the 50 states only: DC is folded into Federal, Puerto Rico is out of scope.
usMap = usMap.loc[list(map(lambda x: x not in ["Puerto Rico", "District of Columbia"], usMap.NAME))]
Add state abbreviation and region and division to the geo-dataframe.
# Attach the abbreviation, region and division to each state polygon.
usMap = usMap.merge(states, left_on="NAME", right_on = "Jurisdiction", how="left").drop("Jurisdiction", axis=1)
usMap.columns = ["GEO_ID", "STATE", "NAME", "LSAD", "CENSUSAREA", "geometry", "Abbreviation"]
usMap = usMap.merge(regions, left_on="NAME", right_on = "State", how="left").drop("State", axis=1)
usMap.head()
# Wide tables: one row per jurisdiction, one column per survey year
# (1978-2016).  header=9/nrows=54 skip the report boilerplate; fully-empty
# columns and rows with missing cells are discarded.
# dropna(axis=..., how=...) is spelled out by keyword -- the positional form
# dropna(1, "all") was deprecated and removed in modern pandas.
malePrisonerPopulation = pd.read_excel("./messy_data/Prisoners under the jurisdiction of state or federal correctional authorities 1978-2016.xlsx", sheet_name="Male", header=9, nrows=54).dropna(axis=1, how="all").dropna(axis=0)
femalePrisonerPopulation = pd.read_excel("./messy_data/Prisoners under the jurisdiction of state or federal correctional authorities 1978-2016.xlsx", sheet_name="Female", header=9, nrows=54).dropna(axis=1, how="all").dropna(axis=0)
totalPrisonerPopulation = pd.read_excel("./messy_data/Prisoners under the jurisdiction of state or federal correctional authorities 1978-2016.xlsx", sheet_name="Total", header=9, nrows=54).dropna(axis=1, how="all").dropna(axis=0)
totalPrisonerPopulation.head()
def cleanAndMeltPopulationTable(table, states):
    """Clean a wide prisoner-population table and melt it to long format.

    Adds the pre-2001 District of Columbia counts into the Federal row (DC
    prisoners were folded into the federal system as of 2001), drops the DC
    row, melts the year columns into a single "Year" column, and attaches the
    jurisdiction abbreviation from ``states``.

    Parameters
    ----------
    table : DataFrame whose first column is "Jurisdiction" and whose
        remaining columns are one per year; missing DC values are "--".
    states : DataFrame with "Jurisdiction" and "Jurisdiction Abbreviation".

    Returns
    -------
    DataFrame with Jurisdiction, Jurisdiction Abbreviation, Year (int) and
    Population (int) columns.  ``table`` is mutated (the DC row is dropped).
    """
    fed_mask = table.Jurisdiction == "Federal"
    dc_mask = table.Jurisdiction == "District of Columbia"
    if dc_mask.any():
        # BUG FIX: the original chained .loc[...].iloc[...] assignment wrote
        # to a temporary copy, so the DC counts were silently never added to
        # the Federal row.  Assign through a single .loc call instead.
        year_cols = table.columns[1:]
        dc_values = table.loc[dc_mask, year_cols].replace("--", 0).to_numpy()
        table.loc[fed_mask, year_cols] = table.loc[fed_mask, year_cols].to_numpy() + dc_values
        table.drop(table.index[dc_mask], inplace=True)
    table = table.melt(id_vars="Jurisdiction", var_name="Year", value_name="Population")
    table.Year = table.Year.astype(int)
    table.Population = table.Population.astype(int)
    table = states.merge(table, on="Jurisdiction")
    return table
Originally, the District of Columbia was treated as an independent jurisdiction in the Bureau of Justice Statistics's survey. However, as of 2001, it is considered part of the federal jurisdiction. This created some missing values for the District of Columbia data and the federal data starting in 2001. To correct for the inconsistency, the District of Columbia data prior to 2001 is added into the federal prisoner population during data cleaning.
# Melt the three wide tables into long (Jurisdiction, Year, Population) form.
meltedMalePrisonerPopulation = cleanAndMeltPopulationTable(malePrisonerPopulation, states)
meltedFemalePrisonerPopulation = cleanAndMeltPopulationTable(femalePrisonerPopulation, states)
meltedTotalPrisonerPopulation = cleanAndMeltPopulationTable(totalPrisonerPopulation, states)
meltedTotalPrisonerPopulation.head()
def aggregateRegionalPopulationSum(table, regions):
    """Sum prisoner counts per census region and per census division.

    ``table`` is a melted population table (Jurisdiction / Year / Population)
    and ``regions`` maps State -> Region / Division.  The Federal system
    belongs to no census region, so it is given its own "Federal" region and
    division before grouping.

    Returns (regionSum, divisionSum): Series indexed by (Year, Region) and
    (Year, Region, Division) respectively.
    """
    merged = regions.merge(table, left_on="State", right_on="Jurisdiction", how="right")
    federal_rows = merged["Jurisdiction"] == "Federal"
    merged.loc[federal_rows, ["Region", "Division"]] = "Federal"
    by_region = merged.groupby(["Year", "Region"]).Population.sum()
    by_division = merged.groupby(["Year", "Region", "Division"]).Population.sum()
    return by_region, by_division
# Region- and division-level totals for each gender breakdown.
malePrisonerPopulationRegionSum, malePrisonerPopulationDivisionSum = aggregateRegionalPopulationSum(meltedMalePrisonerPopulation, regions)
femalePrisonerPopulationRegionSum, femalePrisonerPopulationDivisionSum = aggregateRegionalPopulationSum(meltedFemalePrisonerPopulation, regions)
totalPrisonerPopulationRegionSum, totalPrisonerPopulationDivisionSum = aggregateRegionalPopulationSum(meltedTotalPrisonerPopulation, regions)
def plotAnnualStatePopulation(table, gender, years_chosen):
    """Render one USA choropleth of state prisoner population per chosen year.

    ``table`` is a melted population table.  The Federal row is excluded (it
    has no state on the map) and the color limits are shared across every
    year so the chosen maps are directly comparable.
    """
    states_only = table[table.Jurisdiction != "Federal"]
    zmax = states_only.Population.max()
    zmin = states_only.Population.min()
    for year in states_only.Year.unique():
        if year not in years_chosen:
            continue
        this_year = states_only[states_only.Year == year]
        trace = {
            "type": "choropleth",
            "locations": this_year["Jurisdiction Abbreviation"],
            "locationmode": "USA-states",
            "colorscale": "Viridis",
            "zmin": zmin,
            "zmax": zmax,
            "z": this_year.Population.astype(float),
        }
        layout = {"geo": {"scope": 'usa'},
                  "title": gender + " Prisoner Population by State, " + str(year)}
        iplot(go.Figure(data=[trace], layout=layout))
# gender = "Total"
# cmax = table.Population.max()
# cmin = table.Population.min()
# heatMapData = [dict(type = "choropleth", locations=table.loc[table.Year == year, "Jurisdiction Abbreviation"], locationmode="USA-states", colorscale="Viridis", zmin=cmin, zmax=cmax, z=table.Population[table.Year == year].astype(float), text = str(year), geo = 'geo'+str(i+1) if i != 0 else 'geo') for i, year in enumerate(years)]
# mapLayout = {"geo" + (str(i + 1) if i != 0 else "" ) : {"scope" : 'usa', "domain" : dict(x=[0, 1], y=[i / len(years), (i + 1) / len(years)])} for i in range(len(years))}
# fig = {'data' : heatMapData, 'layout' : mapLayout}
# iplot(fig)
# First/last survey year for each gender; five-year snapshots for the total.
plotAnnualStatePopulation(meltedMalePrisonerPopulation, "Male", [1978, 2016])
plotAnnualStatePopulation(meltedFemalePrisonerPopulation, "Female", [1978, 2016])
plotAnnualStatePopulation(meltedTotalPrisonerPopulation, "Total", [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015])
The prisoner population in most jurisdictions has been increasing in both the male and female systems considerably since 1987. The elevation is especially prominent in California, Texas and Florida.
def plotGeoPandasUSMap(usMap, title=None, dataColumn=None, dataLimit=(None, None)):
    """Draw a GeoDataFrame as a US map colored by ``dataColumn``.

    The axes are clipped to a bounding box covering the US (Alaska and Hawaii
    geometries extend far beyond the mainland), axis decorations are hidden,
    and the color scale is pinned to dataLimit=(vmin, vmax) so maps for
    different years share one scale.
    """
    fig, ax = plt.subplots(1, 1)
    west, east, south, north = -185, -65, 15, 75
    ax.axis((west, east, south, north))
    ax.axis('off')
    ax.set_aspect('equal', 'box')
    ax.set_title(title)
    vmin, vmax = dataLimit
    usMap.plot(column=dataColumn, edgecolor="k", ax=ax, legend=True, vmin=vmin, vmax=vmax)
def plotRegionalPopulation(table, usMap, gender, years_chosen):
    """Map and chart prisoner population aggregated by region or division.

    ``table`` is a Series indexed by (Year, <Region or Division>).  For each
    year in ``years_chosen`` a choropleth of the dissolved geometries is
    drawn (Federal excluded -- it has no geometry on the map); afterwards a
    line chart shows every group, Federal included, over the full period.
    """
    regionType = table.index.names[1]
    without_federal = table[table.index.get_level_values(1) != "Federal"]
    dissolved = usMap[[regionType, "geometry"]].dissolve(regionType)
    shared_limits = (without_federal.min(), without_federal.max())
    for year in table.index.get_level_values(0).unique():
        if year not in years_chosen:
            continue
        yearly = dissolved.join(without_federal[year])
        plotGeoPandasUSMap(
            yearly,
            gender + " Prisoner Population by " + regionType + ", " + str(year),
            "Population",
            shared_limits,
        )
    ax = table.unstack().plot(title=gender + " Prisoner Population by " + regionType)
    ax.set_ylim(bottom=0)
    ax.tick_params(axis="y", length=0)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True,
              shadow=True, ncol=len(table.index.get_level_values(1).unique()))
# Region-level maps and time series.
plotRegionalPopulation(malePrisonerPopulationRegionSum, usMap, "Male", [1978, 2016])
plotRegionalPopulation(femalePrisonerPopulationRegionSum, usMap, "Female", [1978, 2016])
plotRegionalPopulation(totalPrisonerPopulationRegionSum, usMap, "Total", [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015])
def dropRegionIndex(table):
    """Return a copy of ``table`` with the "Region" index level removed.

    Division-level aggregates are indexed by (Year, Region, Division);
    dropping level 1 leaves a (Year, Division) index so the Series can be fed
    to the same plotting helpers as the region-level data.  The input is left
    untouched.
    """
    # Series.droplevel returns a new object, so the full deepcopy the
    # original version made (to avoid mutating the caller's index) is
    # unnecessary.
    return table.droplevel(1)
# Division-level maps and time series (Region level stripped from the index).
plotRegionalPopulation(dropRegionIndex(malePrisonerPopulationDivisionSum), usMap, "Male", [1978, 2016])
plotRegionalPopulation(dropRegionIndex(femalePrisonerPopulationDivisionSum), usMap, "Female", [1978, 2016])
plotRegionalPopulation(dropRegionIndex(totalPrisonerPopulationDivisionSum), usMap, "Total", [1980, 1985, 1990, 1995, 2000, 2005, 2010, 2015])
The prisoner population has been growing the fastest in the South. It is mostly driven by the rapid increases in the South Atlantic and West South Central divisions due to growth in Florida and Texas, respectively.
# 2011-2016 custody population and occupancy-rate sheets; "/" marks
# not-reported values.  The "Low" rate table is read from the "Highest
# Capacity" sheet and the "High" rate table from the "Lowest Capacity" sheet.
custodyPopulation_11_16 = pd.read_excel("./messy_data/Prison occupation rate.xlsx", "Custody Population", na_values="/")
occupancyLowRate_11_16 = pd.read_excel("./messy_data/Prison occupation rate.xlsx", "Highest Capacity", na_values="/")
occupancyHighRate_11_16 = pd.read_excel("./messy_data/Prison occupation rate.xlsx", "Lowest Capacity", na_values="/")
custodyPopulation_11_16.head()
Two occupancy rates were reported -- low estimation and high estimation
occupancyHighRate_11_16.head()
# NOTE(review): duplicate of the previous line -- presumably
# occupancyLowRate_11_16.head() was intended; confirm.
occupancyHighRate_11_16.head()
def cleanAndMeltCustodyOccupancyTable(table, states, tableType):
    """Melt a wide custody/occupancy table into long format.

    ``tableType`` names the value column ("Population" or "Occupancy");
    occupancy figures, reported as percentages, are rescaled to fractions.
    The jurisdiction abbreviation is joined on from ``states``.
    """
    melted = table.melt(id_vars="Jurisdiction", var_name="Year", value_name=tableType)
    if tableType == "Occupancy":
        melted["Occupancy"] = melted["Occupancy"] / 100  # percent -> fraction
    melted["Year"] = melted["Year"].astype(int)
    return states.merge(melted, on="Jurisdiction")
# Long-format versions of the custody population and both occupancy tables.
meltedCustodyPopulation_11_16 = cleanAndMeltCustodyOccupancyTable(custodyPopulation_11_16, states, "Population")
meltedOccupancyLowRate_11_16 = cleanAndMeltCustodyOccupancyTable(occupancyLowRate_11_16, states, "Occupancy")
meltedOccupancyHighRate_11_16 = cleanAndMeltCustodyOccupancyTable(occupancyHighRate_11_16, states, "Occupancy")
meltedCustodyPopulation_11_16.head()
def plotAnnualStateOccupancy(table, low_or_high, years_chosen):
    """Render one USA choropleth of state prison occupancy per chosen year.

    Parameters
    ----------
    table : melted occupancy table with Jurisdiction, Jurisdiction
        Abbreviation, Year and Occupancy columns.
    low_or_high : "Low" or "High"; used only in the figure titles.
    years_chosen : iterable of years to actually plot.

    Color limits are shared across all years so the maps are comparable.
    """
    # BUG FIX: the original filtered with `table.Occupancy != "Federal"`,
    # which is always True for the numeric Occupancy column, so the Federal
    # row was never removed and skewed cmin/cmax.  Filter on Jurisdiction
    # instead, matching plotAnnualStatePopulation above.
    table = table[table.Jurisdiction != "Federal"]
    cmax = table.Occupancy.max()
    cmin = table.Occupancy.min()
    years = table.Year.unique()
    heatMapData = [{"type": "choropleth",
                    "locations": table.loc[table.Year == year, "Jurisdiction Abbreviation"],
                    "locationmode": "USA-states",
                    "colorscale": "Viridis",
                    "zmin": cmin,
                    "zmax": cmax,
                    "z": table.Occupancy[table.Year == year].astype(float)}
                   for year in years]
    mapLayout = [{"geo": {"scope": 'usa'},
                  "title": "Prison Occupancy (" + low_or_high + " Estimation) by State, " + str(year)}
                 for year in years]
    for i, data in enumerate(heatMapData):
        if years[i] in years_chosen:
            usHeatMap = go.Figure(data=[data], layout=mapLayout[i])
            iplot(usHeatMap)
# First and last year of the 2011-2016 window, for both estimates.
plotAnnualStateOccupancy(meltedOccupancyLowRate_11_16, "Low", [2011, 2016])
plotAnnualStateOccupancy(meltedOccupancyHighRate_11_16, "High", [2011, 2016])
The data suggest that during the second decade of the 21st century, the occupancy rate of U.S. prisons has been decreasing. This is especially true for some states such as California. However, for many states, their prisons are still operating at over 100% of their capacity.
def aggregateRegionalOccupancy(table, populationTable, regions):
    """Aggregate state occupancy rates up to region and division level.

    Capacity is recovered per state as population / occupancy; capacities and
    populations are then summed within each (Year, Region[, Division]) group
    and the aggregate occupancy is the ratio of the two sums.  Federal is
    treated as its own region/division.

    Returns (regionOccupancy, divisionOccupancy): Series named "Occupancy",
    indexed by (Year, Region) and (Year, Region, Division).
    """
    merged = regions.merge(table.dropna(), left_on="State", right_on="Jurisdiction", how="right")
    merged = merged.merge(
        populationTable[["Jurisdiction", "Year", "Population"]].dropna(),
        on=["Jurisdiction", "Year"], how="left",
    )
    federal = merged["Jurisdiction"] == "Federal"
    merged.loc[federal, "Region"] = "Federal"
    merged.loc[federal, "Division"] = "Federal"
    merged["Capacity"] = merged.Population / merged.Occupancy

    def ratio(keys):
        # aggregate occupancy = total population / total capacity per group
        grouped = merged.groupby(keys)
        return (grouped.Population.sum() / grouped.Capacity.sum()).rename("Occupancy")

    return ratio(["Year", "Region"]), ratio(["Year", "Region", "Division"])
# Region- and division-level occupancy for both capacity estimates.
regionLowOccupancy, divisionLowOccupancy = aggregateRegionalOccupancy(meltedOccupancyLowRate_11_16, meltedCustodyPopulation_11_16, regions)
regionHighOccupancy, divisionHighOccupancy = aggregateRegionalOccupancy(meltedOccupancyHighRate_11_16, meltedCustodyPopulation_11_16, regions)
def plotRegionalOccupancy(table, usMap, low_or_high, years_chosen):
    """Map and chart prison occupancy aggregated by region or division.

    Analogous to plotRegionalPopulation: one choropleth per chosen year
    (Federal has no geometry and is excluded from the maps) followed by a
    line chart of every group including Federal.  ``low_or_high`` labels the
    capacity estimate used.
    """
    regionType = table.index.names[1]
    without_federal = table[table.index.get_level_values(1) != "Federal"]
    dissolved = usMap[[regionType, "geometry"]].dissolve(regionType)
    shared_limits = (without_federal.min(), without_federal.max())
    for year in table.index.get_level_values(0).unique():
        if year not in years_chosen:
            continue
        yearly = dissolved.join(without_federal[year])
        figtitle = "Prison Occupancy (" + low_or_high + " Estimate) by " + regionType + ", " + str(year)
        plotGeoPandasUSMap(yearly, figtitle, "Occupancy", shared_limits)
    ax = table.unstack().plot(title="Prison Occupancy (" + low_or_high + " Estimate) by " + regionType)
    ax.tick_params(axis="y", length=0)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True,
              shadow=True, ncol=len(table.index.get_level_values(1).unique()))
# Region-level occupancy maps and time series, both estimates.
plotRegionalOccupancy(regionLowOccupancy, usMap, "Low", [2011, 2016])
plotRegionalOccupancy(regionHighOccupancy, usMap, "High", [2011, 2016])
Currently, the prisons in the Northeast and the South are running below their full capacity. Although the Federal correctional facilities are still over-crowded, their occupancy rate has dropped significantly during the second decade of this century.
# Division-level occupancy maps and time series, both estimates.
plotRegionalOccupancy(dropRegionIndex(divisionLowOccupancy), usMap, "Low", [2011, 2016])
plotRegionalOccupancy(dropRegionIndex(divisionHighOccupancy), usMap, "High", [2011, 2016])
The prison systems in the Mountain, Mid-Atlantic, Pacific, East South Central, South Atlantic and West South Central divisions are not over-filled at the moment. Although the prison occupancy rate is very high in the Pacific, it is decreasing rapidly.
# Annual admission counts.  Note the order of operations: rows containing
# real NaN cells are dropped first, then "/" (not reported) is turned into
# NaN so it can be interpolated later.
totalAdmission = pd.read_excel("./messy_data/admissions.xlsx", sheet_name="Total").dropna().replace("/", np.nan)
totalAdmission.head()
maleAdmission = pd.read_excel("./messy_data/admissions.xlsx", sheet_name="Male").dropna().replace("/", np.nan)
femaleAdmission = pd.read_excel("./messy_data/admissions.xlsx", sheet_name="Female").dropna().replace("/", np.nan)
def cleanAndMeltCountTable(table, states):
    """Clean a wide admission/release count table and melt it to long format.

    Same correction as cleanAndMeltPopulationTable: the pre-2001 District of
    Columbia counts are folded into the Federal row before DC is dropped,
    then the year columns are melted into a "Year" column and the
    jurisdiction abbreviation is attached from ``states``.

    Returns a DataFrame with Jurisdiction, Jurisdiction Abbreviation,
    Year (int) and Count (numeric, NaN for unreported) columns.
    ``table`` is mutated (the DC row is dropped).
    """
    fed_mask = table.Jurisdiction == "Federal"
    dc_mask = table.Jurisdiction == "District of Columbia"
    if dc_mask.any():
        # BUG FIX: the original chained .loc[...].iloc[...] assignment wrote
        # to a temporary copy, so the DC counts were silently never added to
        # the Federal row.  Assign through a single .loc call instead.
        year_cols = table.columns[1:]
        dc_values = table.loc[dc_mask, year_cols].replace("--", 0).to_numpy()
        table.loc[fed_mask, year_cols] = table.loc[fed_mask, year_cols].to_numpy() + dc_values
        table.drop(table.index[dc_mask], inplace=True)
    table = table.melt(id_vars="Jurisdiction", var_name="Year", value_name="Count")
    table.Year = table.Year.astype(int)
    # to_numeric (not astype(int)) because unreported years remain NaN
    table.Count = pd.to_numeric(table.Count)
    table = states.merge(table, on="Jurisdiction")
    return table
The inconsistency that involves the District of Columbia and federal numbers, as pointed out for the population data above, also exists for the annual admission and release data. It is corrected in the same fashion as mentioned before.
# Long-format admission tables (Jurisdiction, Year, Count).
meltedTotalAdmission = cleanAndMeltCountTable(totalAdmission, states)
meltedMaleAdmission = cleanAndMeltCountTable(maleAdmission, states)
meltedFemaleAdmission = cleanAndMeltCountTable(femaleAdmission, states)
meltedTotalAdmission.head()
# Annual release counts, loaded and cleaned the same way as admissions.
totalRelease = pd.read_excel("./messy_data/releases.xlsx", sheet_name="Total").dropna().replace("/", np.nan)
maleRelease = pd.read_excel("./messy_data/releases.xlsx", sheet_name="Male").dropna().replace("/", np.nan)
femaleRelease = pd.read_excel("./messy_data/releases.xlsx", sheet_name="Female").dropna().replace("/", np.nan)
meltedTotalRelease = cleanAndMeltCountTable(totalRelease, states)
meltedMaleRelease = cleanAndMeltCountTable(maleRelease, states)
meltedFemaleRelease = cleanAndMeltCountTable(femaleRelease, states)
# Work with the totals; rename the value columns so the three tables can be
# merged on jurisdiction and year.
# NOTE(review): these are aliases, not copies -- the renames below also change
# the columns of meltedTotalAdmission / meltedTotalRelease themselves.
admission = meltedTotalAdmission
release = meltedTotalRelease
population = meltedTotalPrisonerPopulation
release.columns = ['Jurisdiction', 'Jurisdiction Abbreviation', 'Year', 'Release']
admission.columns = ['Jurisdiction', 'Jurisdiction Abbreviation', 'Year', 'Admission']
combinedTable = population.merge(admission, on=['Jurisdiction', 'Jurisdiction Abbreviation', 'Year']).merge(release, on=['Jurisdiction', 'Jurisdiction Abbreviation', 'Year'])
# Per-state discrepancy between the annual population increment and the
# admission-minus-release balance.
mean_error = {}         # state -> mean of (Δpopulation - (admission - release))
epsilon = np.array([])  # pooled proportional errors across all states/years
ttestP = {}             # state -> p-value for H0: E[proportional error] == 0
for state in combinedTable.Jurisdiction.unique():
    table1State = combinedTable[combinedTable.Jurisdiction == state][["Year", "Population", "Admission", "Release"]]
    table1State = table1State.interpolate() # linear interpolation to fill in missing data
    table1State.set_index("Year", inplace = True)
    a_minus_r = table1State.Admission - table1State.Release
    difference = table1State.Population.diff()
    if state == "Federal":
        # illustrate the discrepancy for a single jurisdiction
        a_minus_r.plot(label="admission - release")
        difference.plot(label="population increment")
        plt.ylabel("head count")
        plt.title("Discrepancy between annual population increment and difference between admission and release (Federal)")
        plt.legend()
    error = difference - a_minus_r
    mean_error[state] = error.mean()
    # each year's error normalized by the previous year's population
    proportional_error = error[1:].values / table1State.Population[: -1].values
    epsilon = np.append(epsilon, proportional_error)
    _, ttestP[state] = ttest_1samp(proportional_error, 0)
Theoretically, the annual increase in the prisoner population should match the difference between number of people admitted and released in a single year. $$p[t] - p[t-1] = a[t] - r[t]$$ $p[t]$ -- population, $a[t]$ -- admission, $r[t]$ -- release
However, as the figure Discrepancy between annual population increment and difference between admission and release (Federal) indicates there is some discrepancy in the data.
(Note: some jurisdiction also failed to report their data during certain years. The corresponding missing values are interpolated linearly.)
# Summarize the per-state discrepancies and compare raw vs proportional spread.
mean_error = pd.Series(mean_error)
mean_error.index.name = "Jurisdiction"
mean_error.name = "Mean Error"
mean_error.sort_index(inplace=True)
# NOTE(review): despite the "Mean Population" name, this is the *sum* of the
# population over all years, not the mean -- confirm whether .mean() was
# intended (Proportion scales by this value either way).
mean_population = population[["Jurisdiction", "Population"]].groupby("Jurisdiction").Population.sum()
mean_population.name = "Mean Population"
proportion = mean_error / mean_population
proportion.name = "Proportion"
discrepancyTable = pd.concat([mean_error, mean_population, proportion], axis=1)
discrepancyTable
# coefficient of variation: compare the spread of raw vs proportional error
print("SD/MEAN (error): " + str(discrepancyTable["Mean Error"].std() / discrepancyTable["Mean Error"].mean()))
print("SD/MEAN (proportion): " + str(discrepancyTable["Proportion"].std() / discrepancyTable["Proportion"].mean()))
The data suggest the dynamics of the population increment should include an error term. $$p[t] - p[t-1] = a[t] - r[t] + Err$$ In the table above, the column Mean Error shows the average discrepancy for each state, which seems to have a lot of variance. However, the ratio of the average discrepancy to the mean population across the years, Proportion, has a significantly smaller variance -- the SD/MEAN ratio for the proportion is much lower than that for the mean error. This suggests it is better to quantify the error term as proportional to the population.
# Distribution of the pooled proportional errors.
plt.hist(epsilon, bins=100)
plt.xlim([-0.25, 0.25])
plt.xlabel("proportional error")
plt.ylabel("frequency")
plt.title("proportional error of dynamic discrepancy")
# Per-state one-sample t-tests with a Bonferroni-corrected threshold.
ttestP = pd.Series(ttestP)
ttestP.index.name = "Jurisdiction"
ttestP.name = "p"
alpha = 0.05
significant = ttestP < alpha / ttestP.shape[0]  # Bonferroni: alpha / number of tests
significant.name = "significant"
tTestResult = pd.concat([ttestP, significant], axis=1)
tTestResult
The histogram shows that $\epsilon$ is very likely to be centered at 0. A t-test with the null hypothesis that $E[\epsilon] = 0$ for each state suggests that for most states the null hypothesis should not be rejected. (Note: because multiple tests are being run, the significance level was adjusted.)
Thus, if a model is to be built to delineate the temporal dynamics of population, admission and release. $$p[t] - p[t-1] = a[t] - r[t] + \epsilon p[t-1], E[\epsilon] = 0$$ will be a good choice.